import warnings;
warnings.filterwarnings('ignore');
# Import libraries
import csv
import pandas as pd
from numpy.random import seed
from numpy.random import randn
from statsmodels.graphics.gofplots import qqplot
from matplotlib import pyplot
import matplotlib.pyplot as plt
import matplotlib
from numpy import exp
import numpy as np
import os
import math
import seaborn as sns
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
# Load the adult data set (comma-separated, no header row) into a dataframe,
# supplying the column names at read time.
column_names = ["Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
                "MaritalStatus", "Occupation", "Relationship", "Race",
                "Gender", "CapitalGain", "CapitalLoss", "HoursPerWeek",
                "NativeCountry", "Income"]
adult_df = pd.read_csv('adult_data.txt', sep=",", header=None,
                       names=column_names)
adult_df.head()
adult_df.shape
# Count missing values in each column
adult_df.isnull().sum(axis=0)
adult_df.dtypes
# Work on a copy so the raw load stays untouched
adult_cpy = adult_df.copy()
# Categories of WorkClass before whitespace clean-up
labels = adult_cpy['WorkClass'].astype('category').cat.categories.tolist()
labels
# Strip the leading/trailing spaces from the WorkClass values
adult_cpy['WorkClass'] = adult_cpy['WorkClass'].str.strip()
# Categories of WorkClass again, after stripping
labels = adult_cpy['WorkClass'].astype('category').cat.categories.tolist()
labels
# All rows with WorkClass = 'Never-worked' (7 total) have question marks in Occupation. Therefore, change these question marks to 'Never-worked' to create a new Occupation category.
# Rows whose WorkClass is 'Never-worked' (their Occupation holds a '?')
never_worked_mask = adult_cpy['WorkClass'] == 'Never-worked'
nw = adult_cpy[never_worked_mask]
nw.shape
nw
# Give those rows a proper 'Never-worked' Occupation category instead of '?'
adult_cpy['Occupation'] = np.where(never_worked_mask, 'Never-worked',
                                   adult_cpy['Occupation'])
# Re-inspect the affected rows
nw = adult_cpy[never_worked_mask]
nw
# Frequency of each WorkClass category
freq = pd.crosstab(adult_cpy['WorkClass'], columns='count')
freq.head(len(labels))
# Show rows with '?' in WorkClass column
WkClass = adult_cpy[adult_cpy['WorkClass'] == '?']
WkClass.head()
# Strip leading/trailing whitespace from each remaining categorical column
# and list that column's categories.  (Collapses eight near-identical
# copy-pasted stanzas; the original comments for the Income column were
# mislabeled as NativeCountry.)
for col in ['Education', 'MaritalStatus', 'Occupation', 'Relationship',
            'Race', 'Gender', 'NativeCountry', 'Income']:
    adult_cpy[col] = adult_cpy[col].str.strip()
    labels = adult_cpy[col].astype('category').cat.categories.tolist()
# labels now holds the Income categories (last column processed)
labels
# Frequency of each Occupation category
occ_labels = adult_cpy['Occupation'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_cpy['Occupation'], columns='count')
freq.head(len(occ_labels))
# Show rows with '?' in Occupation column
Occ = adult_cpy[adult_cpy['Occupation'] == '?']
Occ.head()
# Frequency of each NativeCountry category
nc_labels = adult_cpy['NativeCountry'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_cpy['NativeCountry'], columns='count')
freq.head(len(nc_labels))
# For each variable, compare the proportion of data in each of its categories between the two datasets. If the proportion does not change between the full and partial datasets, then there is no pattern to the rows that contain '?' -- so delete the rows with '?' to create the adult_known dataset.
# Remove rows with '?' in any of the three columns that contain them,
# and take an explicit copy so the column additions performed later do not
# operate on a view of adult_cpy (avoids pandas SettingWithCopy aliasing).
known_mask = ((adult_cpy['WorkClass'] != '?') &
              (adult_cpy['Occupation'] != '?') &
              (adult_cpy['NativeCountry'] != '?'))
adult_known = adult_cpy[known_mask].copy()
adult_known.shape
# For each categorical variable, compare the proportion of rows in each
# category between the cleaned dataset (adult_known) and the full dataset
# (adult_cpy).  Similar proportions mean the '?' rows show no systematic
# pattern and can safely be dropped.  (Collapses sixteen copy-pasted
# stanzas into one loop; NativeCountry was not compared in the original
# and is likewise skipped here.)
for col in ['WorkClass', 'Education', 'MaritalStatus', 'Occupation',
            'Relationship', 'Race', 'Gender', 'Income']:
    for df in (adult_known, adult_cpy):
        labels = df[col].astype('category').cat.categories.tolist()
        freq = pd.crosstab(df[col], columns='proportion') / len(df)
        freq.head(len(labels))
# Duplicate WorkClass into WorkClass_c; the copy is recoded to numeric
# class labels below, while WorkClass itself gets one-hot encoded.
adult_known.loc[:, 'WorkClass_c'] = adult_known['WorkClass']
adult_known.head()
# The distributions of the variables did not change significantly when the rows with question marks were removed. Therefore, use the dataset with all known values to fit the models.
adult_known.min()
adult_known.max()
# Recode WorkClass_c from category names to the integers 1..n,
# numbered in the (alphabetical) category order.
labels = adult_known['WorkClass_c'].astype('category').cat.categories.tolist()
code_for_label = {label: code for code, label in enumerate(labels, start=1)}
adult_known['WorkClass_c'] = adult_known['WorkClass_c'].replace(code_for_label)
# One-hot encode every categorical predictor in a single pass
# (dummy columns are appended in the same order as the per-column calls)
categorical_cols = ['WorkClass', 'Education', 'MaritalStatus', 'Occupation',
                    'Relationship', 'Race', 'Gender', 'NativeCountry',
                    'Income']
adult_known = pd.get_dummies(adult_known, columns=categorical_cols)
# Bin Age by decade and one-hot encode the bins as well
adult_known['age_bins'] = pd.cut(x=adult_known['Age'],
                                 bins=[10, 19, 29, 39, 49, 59, 69, 79, 89, 99])
adult_known = pd.get_dummies(adult_known, columns=['age_bins'])
adult_known.head()
# Compare 'fnlwgt' before and after a tenth-root transform applied to
# resolve non-normality; keep the transformed values as fnlwgt_t.
adult_known['fnlwgt_t'] = adult_known['fnlwgt'] ** 0.1
# Box-plot of 'fnlwgt' - no transformation
sns.boxplot(x=adult_known['fnlwgt'])
# Box-plot of transformed 'fnlwgt'
sns.boxplot(x=adult_known['fnlwgt_t'])
# q-q plot of 'fnlwgt' - no transformation
qqplot(adult_known['fnlwgt'], line='s')
pyplot.show()
# q-q plot of transformed 'fnlwgt'
qqplot(adult_known['fnlwgt_t'], line='s')
pyplot.show()
adult_known.head()
# q-q plot for the 'CapitalGain' column - too many zero values
qqplot(adult_known['CapitalGain'], line='s')
pyplot.show()
# q-q plot for the 'CapitalGain' column
# Too many zero values to be transformed to resolve non-normality even with log(x+1)
adult_known['CapitalGain_t'] = np.log(adult_known['CapitalGain']+1)
qqplot(adult_known['CapitalGain_t'], line='s')
pyplot.show()
# q-q plot for the 'CapitalLoss' column - too many zero values to be transformed to resolve non-normality
qqplot(adult_known['CapitalLoss'], line='s')
pyplot.show()
# q-q plot for the 'CapitalLoss' column
# Too many zero values to be transformed to resolve non-normality even with log(x+1)
adult_known['CapitalLoss_t'] = np.log(adult_known['CapitalLoss']+1)
qqplot(adult_known['CapitalLoss_t'], line='s')
pyplot.show()
# q-q plot for difference between CapitalGain and CapitalLoss - transformed for normality
# NOTE(review): raising to the 10th power hugely inflates large differences;
# a fractional exponent (e.g. **0.1) looks more likely to have been intended
# as a normalizing transform.  However, the base can be negative here (rows
# where CapitalLoss > CapitalGain, despite the +1 offset), and only an even
# integer power keeps those values real -- confirm the intent with the author.
adult_known['CapitalDiff_t'] = (adult_known['CapitalGain'] - adult_known['CapitalLoss'] + 1)**10
qqplot(adult_known['CapitalDiff_t'], line='s')
pyplot.show()
# Box-plot of 'HoursPerWeek' - transformed (x**0.75) to resolve non-normality
sns.boxplot(x=adult_known['HoursPerWeek'] ** 0.75)
# q-q plot for the transformed 'HoursPerWeek' column.
# Bug fix: the original plotted adult_known['HoursPerWeek_t']**0.75,
# applying the 0.75 power a second time to the already-transformed column.
adult_known['HoursPerWeek_t'] = adult_known['HoursPerWeek'] ** 0.75
qqplot(adult_known['HoursPerWeek_t'], line='s')
pyplot.show()
adult_known.head()
# Box-plot of 'Age' column - no transformation
sns.boxplot(x=adult_known['Age'])
# Box-plot of 'Age' - log-transformed to resolve non-normality
sns.boxplot(x=np.log(adult_known['Age']))
# q-q plot for the 'Age' column - no transformation
qqplot(adult_known['Age'], line='s')
pyplot.show()
# q-q plot for the log-transformed 'Age' column.
# Bug fix: the original plotted adult_known['Age_t']**0.1, stacking a
# tenth-root on top of the log transform -- which is why the original
# comment observed the graph "does not show resolved non-normality".
adult_known['Age_t'] = np.log(adult_known['Age'])
qqplot(adult_known['Age_t'], line='s')
pyplot.show()
adult_known.head()
# The CapitalGain and CapitalLoss variables are not included in the Gaussian model below because they do not meet the assumption of normality. Even with the log(x+1) transformation, these variables did not become more normally distributed. Despite the assumption violation, these variables, along with a transformation of the difference between the two values, were tried in the model, but none of them yielded any improvement.
# The Gaussian naive Bayes model with 'Age_t', 'fnlwgt_t', and 'HoursPerWeek_t' has a high accuracy for both train and test, 0.7369 and 0.7236, respectively. However, this model only predicts classes 3, 4, and 6. This model was abandoned in favor of the Bernoulli naive Bayes model. See below for that model and the answers to the homework questions.
# Distribution of WorkClass_c, for use as class priors in the models below
labels = adult_known['WorkClass_c'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_known['WorkClass_c'], columns='proportion') / len(adult_known)
freq.head(len(labels))
# Create a Gaussian Naive Bayes model on the three transformed numeric
# predictors, with the numeric WorkClass_c as the target.
gauss_xvars = adult_known[['Age_t', 'fnlwgt_t', 'HoursPerWeek_t']]
target = adult_known[['WorkClass_c']]
# Scale and split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(preprocessing.scale(gauss_xvars),
                                                    target,
                                                    test_size=0.2,
                                                    random_state=1)
mnbg_clf = GaussianNB(
    # class_prior = [0.0313, 0.0685, 0.0002, 0.7387, 0.0356, 0.0828, 0.0424, 0.0005]
)
# Fit on a 1-D label array (avoids sklearn's column-vector DataConversionWarning)
mnb_gmodel = mnbg_clf.fit(x_train, y_train.values.ravel())
y_pred_train = y_train.copy()
y_pred_train['CLASS_PREDS'] = mnb_gmodel.predict(x_train)
pd.crosstab(y_pred_train['WorkClass_c'], y_pred_train['CLASS_PREDS'],
            rownames=['True'], colnames=['Predicted'], margins=True)
# Predicted class probabilities on the training set
train_df = pd.DataFrame(mnb_gmodel.predict_proba(x_train))
train_df.columns = ['prob' + str(col) for col in train_df.columns]
train_df.head()
# Bug fix: copy y_test before adding the prediction column; the original
# aliased y_pred_test to y_test, silently mutating the test labels.
y_pred_test = y_test.copy()
y_pred_test['CLASS_PREDS'] = mnb_gmodel.predict(x_test)
pd.crosstab(y_pred_test['WorkClass_c'], y_pred_test['CLASS_PREDS'],
            rownames=['True'], colnames=['Predicted'], margins=True)
# Predicted class probabilities on the test set
test_df = pd.DataFrame(mnb_gmodel.predict_proba(x_test))
test_df.columns = ['prob' + str(col) for col in test_df.columns]
test_df.head()
accuracy_score(y_pred_train['WorkClass_c'], y_pred_train['CLASS_PREDS'])
accuracy_score(y_pred_test['WorkClass_c'], y_pred_test['CLASS_PREDS'])
# Gap between train and test accuracy
0.7361 - 0.7231
# The accuracy scores for the Bernoulli model are 0.6341 and 0.6283 for train and test, respectively. This model predicts classes 1, 2, 4, 5, 6, and 7. A model with Age binned and converted to dummy variables resulted in a slight decrease in the training and test accuracy, 0.6253 and 0.6155, respectively. That model with age predicted classes 1, 2, 4, 5, 6, 7, and 8. Because of the slightly better accuracy and adequate number of classes being predicted, the model chosen is the model without Age.
# Create a Bernoulli Naive Bayes model.
# Predictors: every dummy column except the WorkClass_* dummies (WorkClass
# is the target) and the age_bins_* dummies (including binned Age slightly
# reduced accuracy -- see the discussion above).  Selecting by prefix keeps
# the same columns, in the same order, as the original hand-written list.
target = adult_known[['WorkClass_c']]
dummy_prefixes = ('Education_', 'MaritalStatus_', 'Occupation_',
                  'Relationship_', 'Race_', 'Gender_', 'NativeCountry_',
                  'Income_')
bern_cols = [c for c in adult_known.columns if c.startswith(dummy_prefixes)]
bern_xvars = adult_known[bern_cols]
# Split into train and test sets.  (An earlier experiment scaled bern_xvars
# with preprocessing.scale first; scaling the binary indicators did not
# change the model accuracy, so the unscaled split is used here and the
# redundant throwaway scaled split has been removed.)
x_train, x_test, y_train, y_test = train_test_split(bern_xvars,
                                                    target,
                                                    test_size=0.2,
                                                    random_state=1)
# Bernoulli NB with class priors taken from the observed WorkClass_c
# distribution (see the proportion table computed above).
mnbb_clf = BernoulliNB(
    class_prior = [0.0313, 0.0685, 0.0002, 0.7387, 0.0356, 0.0828, 0.0424, 0.0005]
)
# Fit the Bernoulli model on a 1-D label array (avoids sklearn's
# column-vector DataConversionWarning)
mnbb_model = mnbb_clf.fit(x_train, y_train.values.ravel())
y_mnbb_train = y_train.copy()
y_mnbb_train['CLASS_PREDS'] = mnbb_model.predict(x_train)
pd.crosstab(y_mnbb_train['WorkClass_c'], y_mnbb_train['CLASS_PREDS'],
            rownames=['True'], colnames=['Predicted'], margins=True)
# Predict probabilities with the Bernoulli model on the training set
# (comment fixed: this is the Bernoulli model, not the multinomial one)
bbayes_train = pd.DataFrame(mnbb_model.predict_proba(x_train))
bbayes_train.columns = ['prob' + str(col) for col in bbayes_train.columns]
bbayes_train.head()
confusion_matrix(y_mnbb_train['WorkClass_c'], y_mnbb_train['CLASS_PREDS'])
y_mnbb_test = y_test.copy()
y_mnbb_test['CLASS_PREDS'] = mnbb_model.predict(x_test)
pd.crosstab(y_mnbb_test['WorkClass_c'], y_mnbb_test['CLASS_PREDS'],
            rownames=['True'], colnames=['Predicted'], margins=True)
# Predict probabilities with the Bernoulli model on the test set
bbayes_test = pd.DataFrame(mnbb_model.predict_proba(x_test))
bbayes_test.columns = ['prob' + str(col) for col in bbayes_test.columns]
bbayes_test.head()
confusion_matrix(y_mnbb_test['WorkClass_c'], y_mnbb_test['CLASS_PREDS'])
accuracy_score(y_mnbb_train['WorkClass_c'], y_mnbb_train['CLASS_PREDS'])
accuracy_score(y_mnbb_test['WorkClass_c'], y_mnbb_test['CLASS_PREDS'])
# Gap between train and test accuracy
0.6341-0.6283
# One-vs-rest indicator columns, one per WorkClass_c class (1..8)
for k in range(1, 9):
    y_mnbb_test['WC' + str(k)] = np.where(y_mnbb_test['WorkClass_c'] == k, 1, 0)
y_mnbb_test.head()
# Visualize the per-class ROC curves.
# Bug fix: the original computed several legend AUC values from the wrong
# probability column (e.g. class 2's curve used prob1 but its printed area
# used prob0).  The loop below keeps each curve and its AUC on the same
# column, and gives each class a distinct color (classes 1-3 were all blue).
roc_colors = ['blue', 'cyan', 'magenta', 'red',
              'green', 'purple', 'orange', 'brown']
for k in range(1, 9):
    truth = y_mnbb_test['WC' + str(k)]
    probs = bbayes_test['prob' + str(k - 1)]
    fpr, tpr, _ = roc_curve(truth, probs)
    plt.plot(fpr, tpr, color=roc_colors[k - 1],
             label='WorkClass: %d ROC Curve (area = %0.5f)'
                   % (k, roc_auc_score(truth, probs)))
# Chance diagonal
plt.plot([0, 1], [0, 1], color='black', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.title('Multinomial Bayes AUC')
plt.legend()
plt.show()
sns.set(style='white', rc={'figure.figsize':(10,10)})
# The multinomial logistic regression model has a higher accuracy on both train and test, 0.7496 and 0.7454, respectively, than the multinomial naive Bayes model, whose train and test accuracy were 0.6341 and 0.6283, respectively. The multinomial logistic regression model predicts classes 1 through 7 on train and 2 through 7 on test. This model did not include binned Age.
# Note: When the binned Age variable was included as dummy variables, the accuracy increased slightly for train and test, at 0.7506 and 0.7476, respectively. The predicted classes between train and test changed only slightly, with test predicting classes 2 through 6.
# Multinomial logistic regression: same predictor set as the Bernoulli
# model (all dummy columns except the WorkClass_* dummies -- WorkClass is
# the target -- and the age_bins_* dummies).  Selecting by prefix keeps
# the same columns, in the same order, as the original hand-written list.
mlr_target = adult_known[['WorkClass_c']]
mlr_prefixes = ('Education_', 'MaritalStatus_', 'Occupation_',
                'Relationship_', 'Race_', 'Gender_', 'NativeCountry_',
                'Income_')
mlr_xvars = adult_known[[c for c in adult_known.columns
                         if c.startswith(mlr_prefixes)]]
x_train, x_test, y_train, y_test = train_test_split(mlr_xvars,
                                                    mlr_target,
                                                    test_size=0.2,
                                                    random_state=1)
# Un-penalized variant with the same solver, kept for reference:
#mnlr = LogisticRegression(random_state=0, multi_class='multinomial', solver='newton-cg')
# Multinomial logistic regression with ridge (L2) regularization
mnlr = LogisticRegression(random_state=0, multi_class='multinomial', solver='newton-cg', penalty = 'l2',
                          C = 2, max_iter = 100)
# Fit on a 1-D label array (avoids sklearn's column-vector DataConversionWarning)
mnlr_model = mnlr.fit(x_train, y_train.values.ravel())
y_mnlr_train = y_train.copy()
y_mnlr_train['CLASS_PREDS'] = mnlr_model.predict(x_train)
pd.crosstab(y_mnlr_train['WorkClass_c'], y_mnlr_train['CLASS_PREDS'],
            rownames=['True'], colnames=['Predicted'], margins=True)
# Predicted class probabilities on the training set
mnlr_train = pd.DataFrame(mnlr_model.predict_proba(x_train))
mnlr_train.columns = ['prob' + str(col) for col in mnlr_train.columns]
mnlr_train.head()
y_mnlr_test = y_test.copy()
y_mnlr_test['CLASS_PREDS'] = mnlr_model.predict(x_test)
pd.crosstab(y_mnlr_test['WorkClass_c'], y_mnlr_test['CLASS_PREDS'],
            rownames=['True'], colnames=['Predicted'], margins=True)
# Predicted class probabilities on the test set
mnlr_test = pd.DataFrame(mnlr_model.predict_proba(x_test))
mnlr_test.columns = ['prob' + str(col) for col in mnlr_test.columns]
mnlr_test.head()
accuracy_score(y_mnlr_train['WorkClass_c'], y_mnlr_train['CLASS_PREDS'])
accuracy_score(y_mnlr_test['WorkClass_c'], y_mnlr_test['CLASS_PREDS'])
# One-vs-rest indicator columns, one per WorkClass_c class (1..8)
for k in range(1, 9):
    y_mnlr_test['WC' + str(k)] = np.where(y_mnlr_test['WorkClass_c'] == k, 1, 0)
# Visualize the per-class ROC curves.
# Bug fix: as in the Bayes plot above, the original computed several legend
# AUC values from the wrong probability column (e.g. class 4's curve used
# prob3 but its printed area used prob1).  The loop keeps each curve and
# its AUC on the same column, with a distinct color per class.
mlr_colors = ['blue', 'cyan', 'magenta', 'red',
              'green', 'purple', 'orange', 'brown']
for k in range(1, 9):
    truth = y_mnlr_test['WC' + str(k)]
    probs = mnlr_test['prob' + str(k - 1)]
    fpr, tpr, _ = roc_curve(truth, probs)
    plt.plot(fpr, tpr, color=mlr_colors[k - 1],
             label='WorkClass: %d ROC Curve (area = %0.5f)'
                   % (k, roc_auc_score(truth, probs)))
# Chance diagonal
plt.plot([0, 1], [0, 1], color='black', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.title('Multinomial Logistic AUC')
plt.legend()
plt.show()
sns.set(style='white', rc={'figure.figsize':(10,10)})
# The ROC curves show how well the model does in terms of sensitivity and specificity. Sensitivity is plotted on the y-axis, and the complement of specificity (1 - specificity) on the x-axis. As sensitivity increases, specificity decreases, so a highly accurate model has an ROC curve that hugs the left and top margins of the graph. The area under the curve (AUC) summarizes this trade-off: a high AUC indicates both high sensitivity and high specificity, which in turn indicates an accurate model.
# In this multinomial model, each curve represents the trade-off between sensitivity and specificity for one WorkClass_c outcome. In the graph, the green line (WorkClass_c = 5) appears to have the largest area because it follows the left and top margins better than the other curves. However, the AUC for the green line, 0.3035, is the lowest of all the AUC measures.
# The Random Forest Classifier, without parameter tuning, overfit the data with train and test accuracies of 0.9248 and 0.8560, respectively. After using RandomizedSearchCV to choose the hyperparameter values, the accuracy for train dropped slightly to 0.8991, but the accuracy for test rose to 0.8810. When the binned Age dummy variables were included, the accuracy on train and test, with hyperparameter tuning, was 0.9139 and 0.8903, respectively.
# The hyperparameter 'min_samples_split' affected the accuracy when changed from 10, the value selected by RandomizedSearchCV, to 20, 50, and 100. The difference between the train and test accuracy stayed between 0.01 and 0.02; however, there was a drop in accuracy.
# min_samples_split=20  -> train accuracy: 0.8811, test accuracy: 0.8791
# min_samples_split=50  -> train accuracy: 0.9027, test accuracy: 0.8907
# min_samples_split=100 -> train accuracy: 0.8969, test accuracy: 0.8898
# One interesting observation: of these three settings, min_samples_split=50 yields train and test accuracies higher than those produced by the values 20 and 100.
#list(adult_known.columns.values.tolist())
# Target: the binary income indicator. Features: every dummy-encoded
# categorical column plus the binned-Age dummies, grouped by source column
# for readability.
rfor_feature_cols = [
    # WorkClass dummies
    'WorkClass_Federal-gov', 'WorkClass_Local-gov', 'WorkClass_Never-worked',
    'WorkClass_Private', 'WorkClass_Self-emp-inc', 'WorkClass_Self-emp-not-inc',
    'WorkClass_State-gov', 'WorkClass_Without-pay',
    # Education dummies
    'Education_10th', 'Education_11th', 'Education_12th', 'Education_1st-4th',
    'Education_5th-6th', 'Education_7th-8th', 'Education_9th',
    'Education_Assoc-acdm', 'Education_Assoc-voc', 'Education_Bachelors',
    'Education_Doctorate', 'Education_HS-grad', 'Education_Masters',
    'Education_Preschool', 'Education_Prof-school', 'Education_Some-college',
    # MaritalStatus dummies
    'MaritalStatus_Divorced', 'MaritalStatus_Married-AF-spouse',
    'MaritalStatus_Married-civ-spouse', 'MaritalStatus_Married-spouse-absent',
    'MaritalStatus_Never-married', 'MaritalStatus_Separated',
    'MaritalStatus_Widowed',
    # Occupation dummies
    'Occupation_Adm-clerical', 'Occupation_Armed-Forces',
    'Occupation_Craft-repair', 'Occupation_Exec-managerial',
    'Occupation_Farming-fishing', 'Occupation_Handlers-cleaners',
    'Occupation_Machine-op-inspct', 'Occupation_Never-worked',
    'Occupation_Other-service', 'Occupation_Priv-house-serv',
    'Occupation_Prof-specialty', 'Occupation_Protective-serv',
    'Occupation_Sales', 'Occupation_Tech-support', 'Occupation_Transport-moving',
    # Relationship dummies
    'Relationship_Husband', 'Relationship_Not-in-family',
    'Relationship_Other-relative', 'Relationship_Own-child',
    'Relationship_Unmarried', 'Relationship_Wife',
    # Race dummies
    'Race_Amer-Indian-Eskimo', 'Race_Asian-Pac-Islander', 'Race_Black',
    'Race_Other', 'Race_White',
    # Gender dummies
    'Gender_Female', 'Gender_Male',
    # NativeCountry dummies
    'NativeCountry_Cambodia', 'NativeCountry_Canada', 'NativeCountry_China',
    'NativeCountry_Columbia', 'NativeCountry_Cuba',
    'NativeCountry_Dominican-Republic', 'NativeCountry_Ecuador',
    'NativeCountry_El-Salvador', 'NativeCountry_England', 'NativeCountry_France',
    'NativeCountry_Germany', 'NativeCountry_Greece', 'NativeCountry_Guatemala',
    'NativeCountry_Haiti', 'NativeCountry_Holand-Netherlands',
    'NativeCountry_Honduras', 'NativeCountry_Hong', 'NativeCountry_Hungary',
    'NativeCountry_India', 'NativeCountry_Iran', 'NativeCountry_Ireland',
    'NativeCountry_Italy', 'NativeCountry_Jamaica', 'NativeCountry_Japan',
    'NativeCountry_Laos', 'NativeCountry_Mexico', 'NativeCountry_Nicaragua',
    'NativeCountry_Outlying-US(Guam-USVI-etc)', 'NativeCountry_Peru',
    'NativeCountry_Philippines', 'NativeCountry_Poland',
    'NativeCountry_Portugal', 'NativeCountry_Puerto-Rico',
    'NativeCountry_Scotland', 'NativeCountry_South', 'NativeCountry_Taiwan',
    'NativeCountry_Thailand', 'NativeCountry_Trinadad&Tobago',
    'NativeCountry_United-States', 'NativeCountry_Vietnam',
    'NativeCountry_Yugoslavia',
    # Binned-Age dummies
    'age_bins_(10, 19]', 'age_bins_(19, 29]', 'age_bins_(29, 39]',
    'age_bins_(39, 49]', 'age_bins_(49, 59]', 'age_bins_(59, 69]',
    'age_bins_(69, 79]', 'age_bins_(79, 89]', 'age_bins_(89, 99]',
]
rfor_target = adult_known[['Income_>50K']]
rfor_xvars = adult_known[rfor_feature_cols]
# Hold out 20% of the rows for testing; fix the seed so the split is
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    rfor_xvars, rfor_target, test_size=0.2, random_state=1)
# Raw counts of each Income_>50K level
freq = pd.crosstab(adult_known['Income_>50K'], columns='count')
freq
# Obtain distribution of Income as a proportion of all rows
labels = adult_known['Income_>50K'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_known['Income_>50K'], columns='proportion') / len(adult_known)
freq.head(len(labels))
# --- Baseline random forest with default hyperparameters ---
# Random forest classifier
rf_clf = RandomForestClassifier(random_state = 0, n_jobs = -1)
rf_clf_model = rf_clf.fit(x_train, y_train)
# Class-membership probabilities for both splits; column index 1 is the
# probability of the positive class (Income_>50K = 1)
rf_train_probs = rf_clf_model.predict_proba(x_train)
rf_test_probs = rf_clf_model.predict_proba(x_test)
rf_train_prob_cols = ['prob0', 'prob1']
rf_preds_train = pd.DataFrame(rf_train_probs, columns=rf_train_prob_cols)
rf_preds_test = pd.DataFrame(rf_test_probs, columns=rf_train_prob_cols)
# Train-set AUC from the positive-class probabilities
fpr, tpr, thresholds = metrics.roc_curve(y_train, rf_preds_train['prob1'])
metrics.auc(fpr, tpr)
# Test-set AUC
fpr, tpr, thresholds = metrics.roc_curve(y_test, rf_preds_test['prob1'])
metrics.auc(fpr, tpr)
# Look at current hyperparameters
print('Parameters currently in use:\n')
pprint(rf_clf.get_params())
n_estimators = number of trees in the forest
max_features = max number of features considered for splitting a node
max_depth = max number of levels in each decision tree, None means expands until leaf purity or all leaves contain < min_samples_split samples
min_samples_split = minimum number of data points placed in a node before the node is split
min_samples_leaf = minimum number of samples required to be at a leaf node
bootstrap = method for sampling data points (with or without replacement)
class_weight = weights associated with classes in the form {class_label: weight}. If not given, all classes are supposed to have weight one
criterion = quality measure - gini or entropy
min_impurity_decrease = node will be split if this split induces a decrease of the impurity greater than or equal to this value
min_weight_fraction_leaf = minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node
# --- Build the randomized hyperparameter search space ---
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# FIX: 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for
# classifiers it was an alias for 'sqrt', so dropping it leaves the
# effective search space unchanged while avoiding a crash on modern sklearn.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = grow until leaves are pure or
# hold fewer than min_samples_split samples)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Method of measuring split quality
criterion = ['gini', 'entropy']
# Minimum impurity decrease required for a node to be split
min_impurity_decrease = [0.0, 0.5, 1.0, 2.0, 5.0]
# Minimum weighted fraction of the total sample weight required at a leaf
min_weight_fraction_leaf = [0, 0.25, 0.5]
# Class weights: equal, or tilted toward class 0 (Income <= 50K)
class_weight = [{0:0.5, 1:0.5}, {0:0.75, 1:0.25}]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion': criterion,
               'min_impurity_decrease': min_impurity_decrease,
               'min_weight_fraction_leaf': min_weight_fraction_leaf,
               'class_weight': class_weight}
pprint(random_grid)
rf_clf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf_clf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(x_train, y_train)
# View the best parameters
rf_random.best_params_
# NOTE: this first configuration (min_samples_split = 100) is immediately
# overwritten by the assignment below and is never fitted; it is kept only
# as a record of a value tried in the write-up above.
# FIX (both constructors): max_features 'auto' -> 'sqrt'. 'auto' was
# deprecated in scikit-learn 1.1 and removed in 1.3; for classifiers it
# was an alias for 'sqrt', so behavior is unchanged.
rf_clf = RandomForestClassifier(random_state = 0
                                , n_jobs = -1
                                , n_estimators = 1200
                                , criterion = 'entropy'
                                , max_features = 'sqrt'
                                , max_depth = 70
                                , min_samples_split = 100
                                , min_samples_leaf = 2
                                , min_weight_fraction_leaf = 0
                                , min_impurity_decrease = 0.0
                                , class_weight = {0: 0.5, 1: 0.5}
                                )
# Final tuned configuration (min_samples_split = 10, the value chosen by
# RandomizedSearchCV) — this is the model actually fitted below.
rf_clf = RandomForestClassifier(random_state = 0
                                , n_jobs = -1
                                , n_estimators = 1200
                                , criterion = 'entropy'
                                , max_features = 'sqrt'
                                , max_depth = 70
                                , min_samples_split = 10
                                , min_samples_leaf = 2
                                , min_weight_fraction_leaf = 0
                                , min_impurity_decrease = 0.0
                                , class_weight = {0: 0.5, 1: 0.5}
                                )
rf_clf_model = rf_clf.fit(x_train, y_train)
# Positive-class probabilities for both splits
rf_train_probs = rf_clf_model.predict_proba(x_train)
rf_test_probs = rf_clf_model.predict_proba(x_test)
rf_train_prob_cols = ['prob0', 'prob1']
rf_preds_train = pd.DataFrame(rf_train_probs, columns=rf_train_prob_cols)
rf_preds_test = pd.DataFrame(rf_test_probs, columns=rf_train_prob_cols)
# Train-set AUC
fpr, tpr, thresholds = metrics.roc_curve(y_train, rf_preds_train['prob1'])
metrics.auc(fpr, tpr)
# Test-set AUC
fpr, tpr, thresholds = metrics.roc_curve(y_test, rf_preds_test['prob1'])
metrics.auc(fpr, tpr)
# Scratch calculation: gap between the train (0.9139) and test (0.8903)
# accuracies quoted in the write-up above.
0.9139-0.8903
import warnings;
warnings.filterwarnings('ignore');
# Import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
import seaborn as sns
from matplotlib import pyplot
import matplotlib.pyplot as plt
import matplotlib
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import plotly.express as px
from sklearn import metrics
from sklearn.manifold import TSNE
import umap
# Read in csv file to dataframe; show every column when displaying
pd.set_option('display.max_columns', None)
pw = pd.read_csv('pitcher_war.csv')
# Display the head
pw.head()
pw['Team'].unique()
# Rows carrying the placeholder team value '- - -' (presumably seasons
# split across multiple teams — verify against the data source)
placeholder_team_rows = pw[pw['Team'] == '- - -']
placeholder_team_rows
pw['Season'].unique()
# Spot-check a single pitcher's rows
price_rows = pw[pw['Name'] == 'David Price']
price_rows
print(pw.dtypes)
pw.describe()
# Create a copy of the dataset
pw_cpy = pw.copy()
# --- Iterative correlation exploration ---
# Each block below recomputes pw_float_vars / corr_matrix for a different
# subset of columns; only the LAST pair feeds the heatmap and the Pearson
# table at the end.
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy.drop(['Name', 'Team', 'Season'], axis=1)
corr_matrix = pw_float_vars.corr().abs()
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['H', 'BB', 'HBP', 'K/9', 'GB', 'FB', 'FIP', 'xFIP']]
corr_matrix = pw_float_vars.corr().abs()
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['BB', 'HBP', 'GB', 'FB', 'BABIP', 'Balls', 'Strikes', 'Pitches', 'SO']]
corr_matrix = pw_float_vars.corr().abs()
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['BB', 'HBP', 'GB', 'FB', 'BABIP', 'SO', 'WHIP', 'RE24']]
corr_matrix = pw_float_vars.corr().abs()
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['BB', 'HBP', 'GB', 'FB', 'RE24']]
corr_matrix = pw_float_vars.corr().abs()
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['HBP', 'GB', 'FB', 'RE24', 'tERA']]
corr_matrix = pw_float_vars.corr().abs()
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['HBP', 'GB', 'FB', 'tERA']]
corr_matrix = pw_float_vars.corr().abs()
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['FIP', 'xFIP', 'tERA']]
corr_matrix = pw_float_vars.corr().abs()
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['HBP', 'GB', 'FB', 'tERA', 'IFFB']]
corr_matrix = pw_float_vars.corr().abs()
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['HBP', 'GB', 'FB', 'SIERA', 'HR', 'ER']]
corr_matrix = pw_float_vars.corr().abs()
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['HBP', 'GB', 'FB', 'SIERA', 'R', 'WAR']]
corr_matrix = pw_float_vars.corr().abs()
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['HBP', 'GB', 'FB', 'SIERA', 'R']]
corr_matrix = pw_float_vars.corr().abs()
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['HBP', 'GB', 'FB', 'SIERA', 'IP', 'GS']]
corr_matrix = pw_float_vars.corr().abs()
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['HBP', 'GB', 'FB', 'SIERA', 'WP', 'BK', 'SO']]
corr_matrix = pw_float_vars.corr().abs()
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['HBP', 'GB', 'FB', 'WP', 'BK', 'SO', 'IBB', 'LD',
'RS', 'IFH', 'BU', 'BUH', 'Starting', 'AVG']]
corr_matrix = pw_float_vars.corr().abs()
# Generate a mask for the upper triangle.
# FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin bool is the documented replacement.
mask = np.zeros_like(corr_matrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Select the upper part of the triangle
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr_matrix, mask=mask, cmap=cmap, vmax=1, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
correlation = pw_float_vars.corr(method ='pearson')
correlation
Remove columns with NaN values and use only numeric variables.
# Rebuild pw_cpy without any column containing NaNs, then keep the numeric
# variables used for clustering/embedding.
pw_cpy = pw.copy()
# Drop columns with any NaN
null_cols = pw_cpy.columns[pw_cpy.isnull().any()]
pw_cpy.drop(null_cols, axis = 1, inplace = True)
pw_cpy.head()
# All numeric variables
all_num = pw_cpy.select_dtypes(include=['number'])
#list(all_num.columns.values.tolist())
# NOTE: this first selection (which still includes Name/Team/Season) is
# immediately overwritten by the numeric-only selection below.
pw_num_vars = pw_cpy[['Name', 'Team', 'Season', 'WAR', 'Strikes' ,'HBP', 'GB', 'FB', 'WP', 'BK',
'SO', 'IBB', 'LD', 'RS', 'IFH', 'BU', 'BUH', 'AVG', 'Starting']]
# Include only numeric columns except playerid.
# WAR is listed first so it lands in column 0 after scaling; the
# commented-out entries are duplicated/derived rate stats deliberately
# excluded from the feature set.
pw_num_vars = pw_cpy[['WAR'
,'W'
,'L'
,'SV'
,'G'
,'GS'
,'IP'
,'K/9'
,'BB/9'
,'HR/9'
,'BABIP'
#,'ERA'
,'FIP'
#,'xFIP'
,'Age'
#,'ERA.1'
,'CG'
,'ShO'
,'SV.1'
,'BS'
,'IP.1'
,'TBF'
,'H'
,'R'
,'ER'
,'HR'
,'BB'
,'IBB'
,'HBP'
,'WP'
,'BK'
,'SO'
,'GB'
,'FB'
,'LD'
,'IFFB'
,'Balls'
,'Strikes'
,'Pitches'
,'RS'
,'IFH'
,'BU'
,'BUH'
##,'K/9.1'
##,'BB/9.1'
,'K/BB'
,'H/9'
##,'HR/9.1'
,'AVG'
#,'WHIP'
##,'BABIP.1'
##,'FIP.1'
,'GB/FB'
,'Starting'
,'Start-IP'
#,'tERA'
##,'xFIP.1'
,'WPA'
#,'RE24'
,'REW'
,'pLI'
,'inLI'
,'gmLI'
,'exLI'
,'Pulls'
,'WPA/LI'
,'Clutch'
,'FBv'
,'wFB'
,'wFB/C'
,'HLD'
,'SD'
,'MD'
##,'ERA-'
,'FIP-'
##,'xFIP-'
#,'SIERA'
,'RS/9'
,'E-F'
,'Pace'
#,'RA9-WAR'
,'BIP-Wins'
,'LOB-Wins'
,'FDP-Wins'
,'kwERA'
,'Pace (pi)'
,'FRM'
##,'K/9+'
##,'BB/9+'
##,'K/BB+'
##,'H/9+'
##,'HR/9+'
##,'AVG+'
##,'WHIP+'
##,'BABIP+'
##,'LOB%+'
,'K%+'
,'BB%+'
,'LD%+'
,'GB%+'
,'FB%+'
,'HR/FB%+'
,'Pull%+'
,'Cent%+'
,'Oppo%+'
,'Soft%+'
,'Med%+'
,'Hard%+']]
pw_num_vars.head()
pw_num_vars.shape
K=3 is the best fit. K=3 is the 'elbow' of the arm in the SSE plot and visually provides the clearest separation of points (over 5, 8, 10 tried below).
# Scale and transform: min-max scale every column to [0, 1]. WAR is the
# first column of pw_num_vars, so scaled WAR sits in column 0 and the
# clustering features occupy the columns from index 1 onward.
mms = MinMaxScaler()
mms.fit(pw_num_vars)
data_transformed = mms.transform(pw_num_vars)
data_transformed.shape
pw_kmeans = pw_cpy.copy()
pw_kmeans.shape
# Evaluate k-means clustering with SSE (elbow method): fit k = 1..14 on
# the feature columns only (WAR in column 0 is excluded).
Sum_of_squared_distances = []
K = range(1,15)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(data_transformed[ : , 1:83])
    Sum_of_squared_distances.append(km.inertia_)
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
# Earlier K values tried; each assignment is superseded by the next and
# only the final K=3 clusterer is ever fitted.
clusterer = KMeans(5, random_state=1)
clusterer = KMeans(8, random_state=1)
clusterer = KMeans(10, random_state=1)
# K=3 is the 'elbow' of the arm and provides the clearest separation of points
clusterer = KMeans(3, random_state=1)
# Fit clusterer
clusterer.fit(data_transformed[ : , 1:83])
data_transformed[ : , 1:83].shape
pw_kmeans.shape
# Predict cluster labels.
# FIX: predict on the same 1:83 column slice the model was fitted on. The
# original sliced 1:100, which only produced the same array because NumPy
# clips out-of-range slice bounds.
pw_kmeans['clust_grp'] = clusterer.predict(data_transformed[ : , 1:83])
pw_kmeans.head()
# Keep only pitchers with non-negative WAR before plotting
pw_kmeans = pw_kmeans[pw_kmeans['WAR'] >= 0]
# Static scatter: Strikes vs WAR, coloured by cluster assignment
sns.lmplot(data=pw_kmeans, x='Strikes', y='WAR', hue='clust_grp', fit_reg=False)
# Recreate this scatterplot with plotly
fig = px.scatter(
    pw_kmeans,
    x='Strikes',
    y='WAR',
    color='clust_grp',
    size='HBP',
    hover_data=['Name'],
)
fig.show()
# Use numeric variables except for playerid and season.
# Project the scaled feature matrix down to 2-D with t-SNE.
tsne_embedded = TSNE(
    n_components=2,
    perplexity=5,
    learning_rate=500,
    n_iter=1000,
    n_iter_without_progress=200,
).fit_transform(data_transformed)
tsne_embedded.shape
# Wrap the embedding in a DataFrame with columns tsne0 / tsne1
tsne_embedded_df = pd.DataFrame(tsne_embedded).add_prefix('tsne')
tsne_embedded_df.head()
# Attach the embedding columns to the cleaned pitcher data
pw_tsne = pd.concat([pw_cpy, tsne_embedded_df], axis=1)
pw_tsne.head()
The t-SNE method did a fairly good job of separating pitchers with high WAR scores from those with lower scores. Most pitchers with high WAR scores, greater than 4, appear in the upper left side of the graph where the tsne0 value is less than -20 and the tsne1 values are average to high. Those pitchers with low scores, less than 3, appear in the upper and lower right side of the graph where the tsne0 value is greater than -20. For lower scoring pitchers, the tsne1 value does not seem to have an effect. Pitchers with scores between 3 and 4 appear scattered throughout the graph so these were not classified as well.
# Scatter the t-SNE embedding with plotly, coloured by WAR so high and
# low scorers stand out; hover shows the pitcher's name.
fig = px.scatter(
    pw_tsne,
    x='tsne0',
    y='tsne1',
    color='WAR',
    # size='HBP',  # optional: scale points by HBP (disabled in original)
    hover_data=['Name'],
)
fig.show()
# Unsupervised UMAP: 15 components, tightly packed clusters (min_dist=0)
reducer = umap.UMAP(
    n_neighbors=5,
    min_dist=0.0,
    n_components=15,
    metric='euclidean',
)
umap_embedding = reducer.fit_transform(data_transformed)
umap_embedding.shape
# Wrap the embedding in a DataFrame with columns umap0, umap1, ...
umap_embedding_df = pd.DataFrame(umap_embedding).add_prefix('umap')
umap_embedding_df.head()
# Attach the embedding columns to the cleaned pitcher data
pw_umap = pd.concat([pw_cpy, umap_embedding_df], axis=1)
# Render plots inline and enlarge the default figure size
%matplotlib inline
sns.set(style='white', rc={'figure.figsize':(25,25)})
#plt.scatter(umap_embedding[:,0], umap_embedding[:,1], c = data_transformed[ : , 0])
# Create scatterplot with plotly, colouring by WAR to compare the
# unsupervised-UMAP layout against the t-SNE plot above
fig = px.scatter(pw_umap, x="umap0", y="umap1", color="WAR"
#,size= 'HBP'
,hover_data=['Name'])
fig.show()
# NOTE(review): this reducer is constructed but never used — the supervised
# embedding below builds a fresh default umap.UMAP() instead. Kept as-is.
reducer = umap.UMAP(n_neighbors = 5
,min_dist = 0.0
,n_components = 15
,metric = 'euclidean'
)
data_transformed.shape
data_transformed.dtype
# let's perform UMAP with a target
# Supervised UMAP: features are the columns from index 1 onward, and the
# supervision target y is column 0 (scaled WAR). The 1:100 upper bound
# relies on NumPy clipping out-of-range slice ends to the array width.
umap_embedding = umap.UMAP().fit_transform(data_transformed[ : , 1:100], y=data_transformed[ : , 0])
umap_embedding.shape
umap_embedding
# Wrap the 2-D embedding as umap0 / umap1 columns
umap_embedding_df = pd.DataFrame(umap_embedding)
umap_embedding_df = umap_embedding_df.add_prefix('umap')
umap_embedding_df.head()
pw_umap = pd.concat([pw_cpy, umap_embedding_df], axis = 1)
# Render plots inline with a large default figure
%matplotlib inline
sns.set(style='white', rc={'figure.figsize':(25,25)})
#plt.scatter(umap_embedding[:,0], umap_embedding[:,1], c = data_transformed[ : , 0])
The supervised UMAP method seems to mirror the unsupervised umap and t-SNE methods with high WAR scores in the upper and lower right portion of the graph. These high scores are associated with a high umap0 value and a low or average umap1 value. Pitchers with scores below 3 appear toward the left where the umap0 value is above average but below the umap0 value for high WAR scores. Most pitchers with WAR scores between 3 and 4 appear together in between the low and the high, although it is difficult to see. Hovering over the dots helps to show this. The second graph shows the distinction between the high and the low a little better than the first graph. You can see a band of light yellow and orange dots between the darker red/orange and green/blue/purple dots.
# Create scatterplot with plotly: supervised-UMAP embedding coloured by WAR
fig = px.scatter(pw_umap, x="umap0", y="umap1", color="WAR"
#,size= 'HBP'
,hover_data=['Name'])
fig.show()
# Colorbar tick labels: integer WAR buckets 1..10
classes = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
fig, ax = plt.subplots(1, figsize=(14, 10))
# Scatter the 2-D embedding, colouring each point by the pitcher's WAR
plt.scatter(*umap_embedding.T, s=30.0, c=pw_umap['WAR'], cmap='Spectral', alpha=1.0)
plt.setp(ax, xticks=[], yticks=[])
cbar = plt.colorbar(boundaries=np.arange(11)-0.5)
cbar.set_ticks(np.arange(10))
cbar.set_ticklabels(classes)
# FIX: the title previously read 'Fashion MNIST Embedded via UMAP using
# Labels' — copied from the UMAP documentation example; this figure shows
# pitcher WAR data, not Fashion MNIST.
plt.title('Pitcher WAR Embedded via Supervised UMAP');